//	Roast+ License

//	SIMD

#ifndef __SFJP_OPENMGL_roast_simd_core_HPP__
#define __SFJP_OPENMGL_roast_simd_core_HPP__

#include <stdio.h>
#include <memory.h>
#include "roast_pp.h"

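#include <nmmintrin.h>   // Include when using the MMX through SSE4.2 instruction sets //
#include <smmintrin.h>   // Include when using the MMX through SSE4.1 instruction sets //
#include <intrin.h>      // Include when using the MMX through SSE3 instruction sets   //
#include <emmintrin.h>   // Include when using the MMX through SSE2 instruction sets   //
#include <xmmintrin.h>   // Include when using the MMX through SSE instruction sets    //
#include <mmintrin.h>    // Include when using the MMX instruction set                 //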


//#define _ROAST_SIMD__ENABLE_FAST_INLINE

//	Upper bound used for XMM register numbering.
//	NOTE(review): presumably the highest register index (xmm15); the functions in
//	this file only handle indices 0..4, and the trailing-underscore variant has the
//	same value — confirm whether both names are needed.
#define _ROAST_SIMD_XMM_MAX		(15)
#define _ROAST_SIMD_XMM_MAX_	(15)


//	MSVC attribute requesting 16-byte alignment for the declaration it prefixes.
#define _ROAST_SIMD__ALIGN_16				__declspec(align(16))
//	Shorthands combining the alignment attribute with float / const float.
#define _ROAST_SIMD__ALIGN_16_FLOAT			_ROAST_SIMD__ALIGN_16 float
#define _ROAST_SIMD__ALIGN_16_CFLOAT		_ROAST_SIMD__ALIGN_16 const float
#define _ROAST_SIMD__ALIGN_16_CONST_FLOAT	_ROAST_SIMD__ALIGN_16_CFLOAT
//	Plain void*: the name documents that the pointee is expected to be 16-byte
//	aligned, but no alignment attribute is applied here.
#define _ROAST_SIMD__ALIGN_16_PVOID			void*
//	Plain double*: "NOAL" presumably means "no alignment required" — TODO confirm.
#define _ROAST_SIMD__NOAL_PDOUBLE			double*



/*	SSE Memory Type (128bit)	*/
/*	128-bit value viewable as any packed element type.
	Every member is a 16-byte array, so all of them alias the same storage.
	The same views are offered under three naming schemes: descriptive names
	(float_x4, ...), abbreviated names (f_x4, ...) and width-based names
	(f32, u64, ...).	*/
typedef union
	__declspec(intrin_type)	//	NOTE(review): the original remark on this attribute was mis-encoded and could not be recovered; confirm that intrin_type is required for this union.
	_ROAST_SIMD__ALIGN_16
{
	//	---- Descriptive names ----
	float               float_x4[4];

	__int8              char_x16[16];
	__int16             short_x8[8];
	__int32             int_x4[4];
	__int64             int64_x2[2];
	__int64             longlong_x2[2];
	unsigned __int8     byte_x16[16];
	unsigned __int16    ushort_x8[8];
	unsigned __int32    uint_x4[4];
	unsigned __int64    uint64_x2[2];
	unsigned __int64    ulonglong_x2[2];

	////////////////////////////////////////////

	//	---- Abbreviated names (same views as above) ----
	float               f_x4[4];

	__int8              c_x16[16];
	__int16             s_x8[8];
	__int32             i_x4[4];
	__int64             i64_x2[2];
	__int64             ll_x2[2];
	unsigned __int8     b_x16[16];
	unsigned __int16    us_x8[8];
	unsigned __int32    ui_x4[4];
	unsigned __int64    ui64_x2[2];
	unsigned __int64    ull_x2[2];

	////////////////////////////////////////////

	//	---- Width-based names; the disabled list below carries the same
	//	     names with an m128_ prefix ----
	float               f32[4];
	unsigned __int64    u64[2];
	__int8              i8[16];
	__int16             i16[8];
	__int32             i32[4];
	__int64             i64[2];
	unsigned __int8     u8[16];
	unsigned __int16    u16[8];
	unsigned __int32    u32[4];
/*
	float               m128_f32[4];
	unsigned __int64    m128_u64[2];
	__int8              m128_i8[16];
	__int16             m128_i16[8];
	__int32             m128_i32[4];
	__int64             m128_i64[2];
	unsigned __int8     m128_u8[16];
	unsigned __int16    m128_u16[8];
	unsigned __int32    m128_u32[4];
*/
} ROAST_SIMD_M128;



/* ********** Mnemonics ********************** */

/*	Move Mnemonics  */
//	Each macro below expands to exactly one MSVC inline-assembler statement
//	("__asm <mnemonic> <operands>"). The register number is token-pasted onto
//	"xmm", so it must be a literal digit sequence, not a run-time variable.
//
//	MOVAPS    : load xmm<XMM_NUM> from the 16 bytes addressed by [AL16FARY].
//	MOVAPS_MX : store xmm<XMM_NUM> to the 16 bytes addressed by [AL16FARY].
#define ROAST_SIMD__MOVAPS(XMM_NUM,AL16FARY)		__asm movaps xmm ## XMM_NUM, xmmword ptr [AL16FARY]
#define ROAST_SIMD__MOVAPS_MX(AL16FARY,XMM_NUM)		__asm movaps [AL16FARY], xmm ## XMM_NUM

//	MOVDQA / MOVUPD : FROM is emitted verbatim as the source operand, so the
//	caller must pass a complete operand (e.g. "xmmword ptr [edx]").
#define ROAST_SIMD__MOVDQA(TO_XMM_NUM,FROM)			__asm movdqa xmm ## TO_XMM_NUM, FROM
#define ROAST_SIMD__MOVUPD(TO_XMM_NUM,FROM)			__asm movupd xmm ## TO_XMM_NUM, FROM

/*	Calculation Mnemonics  */
//#define ROAST_SIMD__MULPS_XM(XMM_NUM,M128VAL)		 __asm mulps xmm ## XMM_NUM, xmmword ptr [M128VAL]
//	MULPS_XM : xmm<TO> *= FROM_M128VAL, where FROM_M128VAL is emitted verbatim
//	           as the source operand.
//	MULPS    : alias of MULPS_XM. There must be no whitespace between the macro
//	           name and '(' — otherwise the preprocessor defines an object-like
//	           macro whose body starts with the parameter list.
#define ROAST_SIMD__MULPS_XM(TO_XMM_NUM, FROM_M128VAL)		__asm mulps xmm ## TO_XMM_NUM, FROM_M128VAL
#define ROAST_SIMD__MULPS(TO_XMM_NUM, FROM_M128VAL)			ROAST_SIMD__MULPS_XM(TO_XMM_NUM, FROM_M128VAL)

//	MULPS_XX : xmm<TO> *= xmm<FROM> (register-to-register form).
#define ROAST_SIMD__MULPS_XX(TO_XMM_NUM, FROM_XMM_NUM)		__asm mulps xmm ## TO_XMM_NUM, xmm ## FROM_XMM_NUM



/* ##### Generate Macros ###### */
//#define _ROAST_SIMD__NEMONI__XMM_M128(NEMONI,XMM_NUM,M128VAL)	

//	One link of an if/else-if chain: when the run-time variable xmm_num equals
//	the literal XMM_NUM, emit _NEMONI(XMM_NUM, M128VAL). The expansion starts
//	with "else", so the use site must supply a leading "if" (as the
//	"if ( 1 == 0 ){}" further down does).
#define _ROAST_SIMD__XMM_M128__CALLBACK(XMM_NUM,_NEMONI,M128VAL)		\
	else if ( xmm_num == XMM_NUM ){ _NEMONI(XMM_NUM, M128VAL); }

//	Hands the callback above to ROAST_PP_REPEART_INC_PARAM2 (from roast_pp.h,
//	not visible here). Presumably that macro invokes the callback once per
//	register index — TODO confirm against roast_pp.h.
#define _ROAST_SIMD__NEMONI__XMM_M128(XMM_NUM,NEMONI,M128VAL)	\
	ROAST_PP_REPEART_INC_PARAM2(XMM_NUM,_ROAST_SIMD__XMM_M128__CALLBACK,NEMONI,M128VAL)


//	Inner link for the two-register form: dispatches on the run-time variable
//	xmm_num2 and emits _NEMONI(XMM_NUM1, XMM_NUM2).
#define _ROAST_SIMD__XMM1_XMM2__CALLBACK(XMM_NUM2,_NEMONI,XMM_NUM1)		\
	else if ( xmm_num2 == XMM_NUM2 ){ _NEMONI(XMM_NUM1, XMM_NUM2); }

//	Outer link for the two-register form: dispatches on xmm_num1, then opens a
//	nested chain over the second register.
//	NOTE(review): the nested invocation passes XMM_NUM and M128VAL, which are
//	not parameters of this macro (they are XMM_NUM1, _NEMONI, XMM_NUM2), so
//	they reach the expansion as literal tokens. It looks like
//	(XMM_NUM2, ..., _NEMONI, XMM_NUM1) was intended — confirm against
//	roast_pp.h before changing. Also verify that ROAST_PP_REPEART_INC_PARAM2
//	can be re-entered from inside its own callback; the preprocessor does not
//	re-expand a macro within its own expansion.
#define _ROAST_SIMD__XMM_XMM__CALLBACK(XMM_NUM1,_NEMONI,XMM_NUM2)		\
	else if ( xmm_num1 == XMM_NUM1 ){	\
		if ( 1 == 0 ){}	\
		ROAST_PP_REPEART_INC_PARAM2(XMM_NUM,_ROAST_SIMD__XMM1_XMM2__CALLBACK,_NEMONI,M128VAL)	\
	}




/* ######### Internal ########## */

//	Switches that decide how the instruction-set groups (simd, sse, sse2, ...)
//	are declared. The active configuration uses namespaces; the commented-out
//	alternatives use "class" or no second-level scope at all.
//#	define ROAST_SIMD_CLASS_OR_NS				class
#	define ROAST_SIMD_CLASS_OR_NS				namespace
//	Storage-class placeholder put on every wrapper function; empty in the
//	namespace configuration.
#	define ROAST_SIMD_STATIC
//	Open / close one second-level scope named X.
#	define ROAST_SIMD_2ND_NAMESPACE_START(X)	ROAST_SIMD_CLASS_OR_NS X {
#	define ROAST_SIMD_2ND_NAMESPACE_END			}
//#	define ROAST_SIMD_2ND_NAMESPACE_START(X)
//#	define ROAST_SIMD_2ND_NAMESPACE_END

/* Internal End. */



//////////////////////////////////////////////////////////////////////////////////////////////////////

/*	namespace roast	*/
namespace roast{

	/*	namespace simd	*/
	ROAST_SIMD_CLASS_OR_NS simd
	{

		//	########  Definitations  ####################

		//typedef ROAST_SIMD_M128 m128;	/*	SSE Memory Type (128bit)	*/
		//	roast::simd::m128 is the multi-view 128-bit union ROAST_SIMD_M128.
		typedef ROAST_SIMD_M128 m128;	/*	SSE Memory Type (128bit)	*/

		//	128-bit type used by the generated wrappers' by-value parameters;
		//	currently the compiler's own __m128 rather than the union above.
		typedef __m128 _ROAST_SIMD_USE_M128;


		//////////////////////////////////////////////////////////////////


		//	########  MMX  ####################

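		/*	NOTE: the original comment here was mis-encoded and could not be recovered.
			No MMX wrappers are defined in this section.	*/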




		//	########  SSE  ####################
		ROAST_SIMD_2ND_NAMESPACE_START(sse)


		/*	Generates wrapper functions by including "simd_core__xmm_m128" twice,
			each time configured through the ___ROAST_SIMD__* macros defined just
			before the include. The included file is not visible here; judging by
			the disabled hand-written versions below, each pass presumably emits
			movaps(int xmm_num, <VAL_TYPE> <VAL_NAME>), which loads the value into
			XMMx (x = xmm_num) — TODO confirm against the include file.
			NOTE(review): the second pass redefines the parameter macros without
			an #undef here, so the include file presumably #undefs them itself.	*/

		/*	Pass 1: operand is a pointer to four floats (VAL_IS_POINTER is defined).	*/
#define ___ROAST_SIMD__NEMONI_			movaps
#define ___ROAST_SIMD__VAL_TYPE_		const float*
#define ___ROAST_SIMD__VAL_NAME_		f_array_4
#define ___ROAST_SIMD__VAL_IS_POINTER	
#include "simd_core__xmm_m128"

		/*	Pass 2: operand is a 128-bit value passed by value (VAL_IS_POINTER is not defined here).	*/
#define ___ROAST_SIMD__NEMONI_			movaps
#define ___ROAST_SIMD__VAL_TYPE_		_ROAST_SIMD_USE_M128
#define ___ROAST_SIMD__VAL_NAME_		m128val
#include "simd_core__xmm_m128"

/*
		//inline static void movaps(int xmm_num, _ROAST_SIMD__ALIGN_16 float f_array_4[4])
		inline ROAST_SIMD_STATIC void movaps(int xmm_num, _ROAST_SIMD__ALIGN_16 float f_array_4[])
		{
			__asm mov edx, [f_array_4]

			switch( xmm_num )
			{
			case 0: ROAST_SIMD__MOVAPS(0, edx);
				break;
			case 1: ROAST_SIMD__MOVAPS(1, edx);
				break;
			case 2: ROAST_SIMD__MOVAPS(2, edx);
				break;
			case 3: ROAST_SIMD__MOVAPS(3, edx);
				break;
			case 4: ROAST_SIMD__MOVAPS(4, edx);
				break;
			}
		}
*/

		/*	Disabled hand-written variant: load a __m128 value into XMMx (x = xmm_num).	*/
		//inline static void movaps(int xmm_num, _ROAST_SIMD__ALIGN_16 float f_array_4[4])
/*		inline ROAST_SIMD_STATIC void movaps(int xmm_num, __m128 m128val)
		{
			switch( xmm_num )
			{
			case 0: ROAST_SIMD__MOVAPS(0, m128val);
				break;
			case 1: ROAST_SIMD__MOVAPS(1, m128val);
				break;
			case 2: ROAST_SIMD__MOVAPS(2, m128val);
				break;
			case 3: ROAST_SIMD__MOVAPS(3, m128val);
				break;
			case 4: ROAST_SIMD__MOVAPS(4, m128val);
				break;
			}
		}*/

		/*	Multiply XMMx (x = xmm_num) by four packed floats read from memory.

			Executes "mulps xmmN, xmmword ptr [f_array_4]"; the product is left in
			xmmN.

			xmm_num   : register index. Only 0..4 are handled; any other value
			            does nothing.
			f_array_4 : address of four floats. mulps with a memory operand
			            requires the address to be 16-byte aligned.

			The pointer is loaded into EDX inside the same __asm block as the
			mulps. MSVC does not guarantee that a register keeps its value between
			separate __asm blocks, and the code generated for the switch may use
			EDX. The source operand is the memory at [edx], not the register
			itself, because mulps has no general-purpose-register operand form.	*/
		inline ROAST_SIMD_STATIC void mulps(int xmm_num, _ROAST_SIMD__ALIGN_16 float f_array_4[])
		{
			switch( xmm_num )
			{
			case 0:
				__asm {
					mov		edx, f_array_4
					mulps	xmm0, xmmword ptr [edx]
				}
				break;
			case 1:
				__asm {
					mov		edx, f_array_4
					mulps	xmm1, xmmword ptr [edx]
				}
				break;
			case 2:
				__asm {
					mov		edx, f_array_4
					mulps	xmm2, xmmword ptr [edx]
				}
				break;
			case 3:
				__asm {
					mov		edx, f_array_4
					mulps	xmm3, xmmword ptr [edx]
				}
				break;
			case 4:
				__asm {
					mov		edx, f_array_4
					mulps	xmm4, xmmword ptr [edx]
				}
				break;
			default:
				//	Unsupported register index: intentionally a no-op.
				break;
			}
		}

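		/*	Disabled: register selection through template parameters.
			(ROAST_SIMD__MULPS_XX token-pastes its arguments in the preprocessor,
			which runs before template arguments are known.)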
		template <int xmm_n1, int xmm_n2>
			inline void mulps_2()
		{
			ROAST_SIMD__MULPS_XX(xmm_n1, xmm_n2);
		}
		*/
		
		/*	Packed single-precision multiply: XMMx (x = xmm_num) *= m128val.

			xmm_num : register index. Only 0..4 are handled; any other value
			          does nothing (the switch has no default).
			m128val : the four-float multiplier; its name is used directly as the
			          memory operand of mulps.

			NOTE(review): m128val is passed by value. If the compiler passes it
			in an XMM register, that register's previous contents are gone
			before the mulps runs — confirm the calling convention in use.	*/
		inline ROAST_SIMD_STATIC void mulps(int xmm_num, __m128 m128val)
		{
			switch( xmm_num )
			{
			case 0: ROAST_SIMD__MULPS_XM(0, m128val);
				break;
			case 1: ROAST_SIMD__MULPS_XM(1, m128val);
				break;
			case 2: ROAST_SIMD__MULPS_XM(2, m128val);
				break;
			case 3: ROAST_SIMD__MULPS_XM(3, m128val);
				break;
			case 4: ROAST_SIMD__MULPS_XM(4, m128val);
				break;
			}
		}

		/*	Store XMMx (x = xmm_num) into the __m128 object referred to by m128val.

			m128val : destination object (written only for a supported index).
			xmm_num : register index. Only 0..4 are handled; any other value
			          leaves m128val untouched.

			The destination address is taken in C++ and loaded into EDX inside
			each __asm block. Naming a reference parameter directly in MSVC inline
			assembly addresses the hidden pointer that implements the reference,
			not the object it refers to, so "movaps [m128val], xmmN" would not
			write to the caller's object.	*/
		inline ROAST_SIMD_STATIC void movaps(__m128 &m128val, int xmm_num)
		{
			__m128 *dst = &m128val;

			switch( xmm_num )
			{
			case 0:
				__asm {
					mov		edx, dst
					movaps	xmmword ptr [edx], xmm0
				}
				break;
			case 1:
				__asm {
					mov		edx, dst
					movaps	xmmword ptr [edx], xmm1
				}
				break;
			case 2:
				__asm {
					mov		edx, dst
					movaps	xmmword ptr [edx], xmm2
				}
				break;
			case 3:
				__asm {
					mov		edx, dst
					movaps	xmmword ptr [edx], xmm3
				}
				break;
			case 4:
				__asm {
					mov		edx, dst
					movaps	xmmword ptr [edx], xmm4
				}
				break;
			default:
				//	Unsupported register index: intentionally a no-op.
				break;
			}
		}

		/*	Return the current contents of XMMx (x = xmm_num) as a __m128.

			xmm_num : register index. Only 0..4 are handled.
			returns : a copy of the selected register; all-zero when xmm_num is
			          not a supported index (previously the result was an
			          uninitialized object in that case).

			The zeroing is done only in the default branch, after it is known
			that no register has to be read. Initializing m128val up front
			could make the compiler use an XMM register as scratch before the
			movaps below has copied it.	*/
		inline ROAST_SIMD_STATIC __m128 get_movaps(int xmm_num)
		{
			__m128 m128val;

			switch( xmm_num )
			{
			case 0: ROAST_SIMD__MOVAPS_MX(m128val, 0);
				break;
			case 1: ROAST_SIMD__MOVAPS_MX(m128val, 1);
				break;
			case 2: ROAST_SIMD__MOVAPS_MX(m128val, 2);
				break;
			case 3: ROAST_SIMD__MOVAPS_MX(m128val, 3);
				break;
			case 4: ROAST_SIMD__MOVAPS_MX(m128val, 4);
				break;
			default:
				m128val = _mm_setzero_ps();
				break;
			}

			return m128val;
		}

		/*	Store XMMx (x = xmm_num) to the four floats at f_array_4.

			f_array_4 : destination address; movaps requires it to be 16-byte
			            aligned.
			xmm_num   : register index. Only 0..4 are handled; any other value
			            writes nothing.

			The pointer is loaded into EDX inside the same __asm block as the
			store. MSVC does not guarantee that a register keeps its value between
			separate __asm blocks, and the code generated for the switch may use
			EDX.	*/
		inline ROAST_SIMD_STATIC void movaps(float *f_array_4, int xmm_num)
		{
			switch( xmm_num )
			{
			case 0:
				__asm {
					mov		edx, f_array_4
					movaps	xmmword ptr [edx], xmm0
				}
				break;
			case 1:
				__asm {
					mov		edx, f_array_4
					movaps	xmmword ptr [edx], xmm1
				}
				break;
			case 2:
				__asm {
					mov		edx, f_array_4
					movaps	xmmword ptr [edx], xmm2
				}
				break;
			case 3:
				__asm {
					mov		edx, f_array_4
					movaps	xmmword ptr [edx], xmm3
				}
				break;
			case 4:
				__asm {
					mov		edx, f_array_4
					movaps	xmmword ptr [edx], xmm4
				}
				break;
			default:
				//	Unsupported register index: intentionally a no-op.
				break;
			}
		}


		//	########  SSE2  ####################
		ROAST_SIMD_2ND_NAMESPACE_END
		ROAST_SIMD_2ND_NAMESPACE_START(sse2)


		/*	Aligned 128-bit integer load: XMMx (x = xmm_num) <- 16 bytes at p.

			xmm_num : register index. 0..4 are handled; any other value does
			          nothing. Index 0 used to be skipped silently; it is now
			          handled like the sse:: functions do.
			p       : source address; movdqa requires it to be 16-byte aligned.

			The pointer value is first loaded into EDX and then dereferenced.
			Using the variable name p itself as the movdqa operand would address
			the storage of the pointer variable, not the data it points to.	*/
		inline ROAST_SIMD_STATIC void movdqa(int xmm_num, _ROAST_SIMD__ALIGN_16_PVOID p)
		{
			switch( xmm_num )
			{
			case 0:
				__asm {
					mov		edx, p
					movdqa	xmm0, xmmword ptr [edx]
				}
				break;
			case 1:
				__asm {
					mov		edx, p
					movdqa	xmm1, xmmword ptr [edx]
				}
				break;
			case 2:
				__asm {
					mov		edx, p
					movdqa	xmm2, xmmword ptr [edx]
				}
				break;
			case 3:
				__asm {
					mov		edx, p
					movdqa	xmm3, xmmword ptr [edx]
				}
				break;
			case 4:
				__asm {
					mov		edx, p
					movdqa	xmm4, xmmword ptr [edx]
				}
				break;
			default:
				//	Unsupported register index: intentionally a no-op.
				break;
			}
		}


		/*	Unaligned load of two packed doubles: XMMx (x = xmm_num) <- 16 bytes
			at p_double.

			xmm_num  : register index. 0..4 are handled; any other value does
			           nothing. Index 0 used to be skipped silently; it is now
			           handled like the sse:: functions do.
			p_double : source address of two doubles; movupd has no alignment
			           requirement.

			The pointer value is first loaded into EDX and then dereferenced.
			Using the variable name p_double itself as the movupd operand would
			address the storage of the pointer variable, not the data it points
			to.	*/
		inline ROAST_SIMD_STATIC void movupd(int xmm_num, _ROAST_SIMD__NOAL_PDOUBLE p_double)
		{
			switch( xmm_num )
			{
			case 0:
				__asm {
					mov		edx, p_double
					movupd	xmm0, xmmword ptr [edx]
				}
				break;
			case 1:
				__asm {
					mov		edx, p_double
					movupd	xmm1, xmmword ptr [edx]
				}
				break;
			case 2:
				__asm {
					mov		edx, p_double
					movupd	xmm2, xmmword ptr [edx]
				}
				break;
			case 3:
				__asm {
					mov		edx, p_double
					movupd	xmm3, xmmword ptr [edx]
				}
				break;
			case 4:
				__asm {
					mov		edx, p_double
					movupd	xmm4, xmmword ptr [edx]
				}
				break;
			default:
				//	Unsupported register index: intentionally a no-op.
				break;
			}
		}

		//	########  SSE3  ####################
		ROAST_SIMD_2ND_NAMESPACE_END
		ROAST_SIMD_2ND_NAMESPACE_START(sse3)



		//	########  SSE4  ####################
		ROAST_SIMD_2ND_NAMESPACE_END
		ROAST_SIMD_2ND_NAMESPACE_START(sse4)



		/*
		//	########  SSE4.1  ####################
		ROAST_SIMD_2ND_NAMESPACE_END
		ROAST_SIMD_2ND_NAMESPACE_START(sse41)

		ROAST_SIMD_2ND_NAMESPACE_END
		*/


		//	########  SSE4.2  ####################
		ROAST_SIMD_2ND_NAMESPACE_END
		ROAST_SIMD_2ND_NAMESPACE_START(sse42)



		ROAST_SIMD_2ND_NAMESPACE_END
	};
}

#endif//__SFJP_OPENMGL_roast_simd_core_HPP__
